{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CART 分类树准确率 0.9600\n"
     ]
    }
   ],
   "source": [
    "# encoding=utf-8\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.datasets import load_iris\n",
    "# 准备数据集\n",
    "iris=load_iris()\n",
    "# 获取特征集和分类标识\n",
    "features = iris.data\n",
    "labels = iris.target\n",
    "# 随机抽取 33% 的数据作为测试集，其余为训练集\n",
    "train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=0)\n",
    "# 创建 CART 分类树\n",
    "clf = DecisionTreeClassifier(criterion='gini')\n",
    "# 拟合构造 CART 分类树\n",
    "clf = clf.fit(train_features, train_labels)\n",
    "# 用 CART 分类树做预测\n",
    "test_predict = clf.predict(test_features)\n",
    "# 预测结果与测试集结果作比对\n",
    "score = accuracy_score(test_labels, test_predict)\n",
    "print(\"CART 分类树准确率 %.4lf\" % score)\n",
    "\n",
    "# 首先 train_test_split 可以帮助我们把数据集抽取一部分作为测试集，这样我们就可以得到训练集和测试集。\n",
    "# 使用 clf = DecisionTreeClassifier(criterion=‘gini’) 初始化一棵 CART 分类树。这样你就可以对 CART 分类树进行训练。\n",
    "# 使用 clf.fit(train_features, train_labels) 函数，将训练集的特征值和分类标识作为参数进行拟合，得到 CART 分类树。\n",
    "# 使用 clf.predict(test_features) 函数进行预测，传入测试集的特征值，可以得到测试结果 test_predict。\n",
    "# 最后使用 accuracy_score(test_labels, test_predict) 函数，传入测试集的预测结果与实际的结果作为参数，得到准确率 score。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'\n 'B' 'LSTAT']\n回归树二乘偏差均值: 17.506467065868264\n回归树绝对值偏差均值: 2.9149700598802393\n"
     ]
    }
   ],
   "source": [
    "# encoding=utf-8\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.datasets import load_boston\n",
    "from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "# 准备数据集\n",
    "boston=load_boston()\n",
    "# 探索数据\n",
    "print(boston.feature_names)\n",
    "# 获取特征集和房价\n",
    "features = boston.data\n",
    "prices = boston.target\n",
    "# 随机抽取 33% 的数据作为测试集，其余为训练集\n",
    "train_features, test_features, train_price, test_price = train_test_split(features, prices, test_size=0.33)\n",
    "# 创建 CART 回归树\n",
    "dtr=DecisionTreeRegressor()\n",
    "# 拟合构造 CART 回归树\n",
    "dtr.fit(train_features, train_price)\n",
    "# 预测测试集中的房价\n",
    "predict_price = dtr.predict(test_features)\n",
    "# 测试集的结果评价\n",
    "print('回归树二乘偏差均值:', mean_squared_error(test_price, predict_price))\n",
    "print('回归树绝对值偏差均值:', mean_absolute_error(test_price, predict_price))\n",
    "\n",
    "# 我们来看下这个例子，首先加载了波士顿房价数据集，得到特征集和房价。然后通过 train_test_split 帮助我们把数据集抽取一部分作为测试集，其余作为训练集。\n",
    "# 使用 dtr=DecisionTreeRegressor() 初始化一棵 CART 回归树。\n",
    "# 使用 dtr.fit(train_features, train_price) 函数，将训练集的特征值和结果作为参数进行拟合，得到 CART 回归树。\n",
    "# 使用 dtr.predict(test_features) 函数进行预测，传入测试集的特征值，可以得到预测结果 predict_price。\n",
    "# 最后我们可以求得这棵回归树的二乘偏差均值，以及绝对值偏差均值。\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 891 entries, 0 to 890\nData columns (total 12 columns):\nPassengerId    891 non-null int64\nSurvived       891 non-null int64\nPclass         891 non-null int64\nName           891 non-null object\nSex            891 non-null object\nAge            714 non-null float64\nSibSp          891 non-null int64\nParch          891 non-null int64\nTicket         891 non-null object\nFare           891 non-null float64\nCabin          204 non-null object\nEmbarked       889 non-null object\ndtypes: float64(2), int64(5), object(5)\nmemory usage: 83.6+ KB\nNone\n------------------------------\n       PassengerId    Survived      Pclass         Age       SibSp  \\\ncount   891.000000  891.000000  891.000000  714.000000  891.000000   \nmean    446.000000    0.383838    2.308642   29.699118    0.523008   \nstd     257.353842    0.486592    0.836071   14.526497    1.102743   \nmin       1.000000    0.000000    1.000000    0.420000    0.000000   \n25%     223.500000    0.000000    2.000000   20.125000    0.000000   \n50%     446.000000    0.000000    3.000000   28.000000    0.000000   \n75%     668.500000    1.000000    3.000000   38.000000    1.000000   \nmax     891.000000    1.000000    3.000000   80.000000    8.000000   \n\n            Parch        Fare  \ncount  891.000000  891.000000  \nmean     0.381594   32.204208  \nstd      0.806057   49.693429  \nmin      0.000000    0.000000  \n25%      0.000000    7.910400  \n50%      0.000000   14.454200  \n75%      0.000000   31.000000  \nmax      6.000000  512.329200  \n------------------------------\n                                Name   Sex    Ticket    Cabin Embarked\ncount                            891   891       891      204      889\nunique                           891     2       681      147        3\ntop     Williams, Mr. Charles Eugene  male  CA. 2343  B96 B98        S\nfreq                               1   577         7        4      644\n------------------------------\n   PassengerId  Survived  Pclass  \\\n0            1         0       3   \n1            2         1       1   \n2            3         1       3   \n3            4         1       1   \n4            5         0       3   \n\n                                                Name     Sex   Age  SibSp  \\\n0                            Braund, Mr. Owen Harris    male  22.0      1   \n1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n2                             Heikkinen, Miss. Laina  female  26.0      0   \n3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n4                           Allen, Mr. William Henry    male  35.0      0   \n\n   Parch            Ticket     Fare Cabin Embarked  \n0      0         A/5 21171   7.2500   NaN        S  \n1      0          PC 17599  71.2833   C85        C  \n2      0  STON/O2. 3101282   7.9250   NaN        S  \n3      0            113803  53.1000  C123        S  \n4      0            373450   8.0500   NaN        S  \n------------------------------\n     PassengerId  Survived  Pclass                                      Name  \\\n886          887         0       2                     Montvila, Rev. Juozas   \n887          888         1       1              Graham, Miss. Margaret Edith   \n888          889         0       3  Johnston, Miss. Catherine Helen \"Carrie\"   \n889          890         1       1                     Behr, Mr. Karl Howell   \n890          891         0       3                       Dooley, Mr. Patrick   \n\n        Sex   Age  SibSp  Parch      Ticket   Fare Cabin Embarked  \n886    male  27.0      0      0      211536  13.00   NaN        S  \n887  female  19.0      0      0      112053  30.00   B42        S  \n888  female   NaN      1      2  W./C. 6607  23.45   NaN        S  \n889    male  26.0      0      0      111369  30.00  C148        C  \n890    male  32.0      0      0      370376   7.75   NaN        Q  \n"
     ]
    }
   ],
   "source": [
    "# 模块 1：数据探索\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# 数据加载\n",
    "train_data = pd.read_csv('/Users/yuxiang/Documents/GitHub/python-learning/geektime/data_analysis/decision_tree/Titanic_Data/train.csv')\n",
    "test_data = pd.read_csv('/Users/yuxiang/Documents/GitHub/python-learning/geektime/data_analysis/decision_tree/Titanic_Data/test.csv')\n",
    "# 数据探索\n",
    "print(train_data.info())\n",
    "print('-'*30)\n",
    "print(train_data.describe())\n",
    "print('-'*30)\n",
    "print(train_data.describe(include=['O']))\n",
    "print('-'*30)\n",
    "print(train_data.head())\n",
    "print('-'*30)\n",
    "print(train_data.tail())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "S    646\nC    168\nQ     77\nName: Embarked, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# 模块 2：数据清洗\n",
    "# 使用平均年龄来填充年龄中的 nan 值\n",
    "train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)\n",
    "test_data['Age'].fillna(test_data['Age'].mean(),inplace=True)\n",
    "# 使用票价的均值填充票价中的 nan 值\n",
    "train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)\n",
    "test_data['Fare'].fillna(test_data['Fare'].mean(),inplace=True)\n",
    "\n",
    "print(train_data['Embarked'].value_counts())\n",
    "\n",
    "# 使用登录最多的港口来填充登录港口的 nan 值\n",
    "train_data['Embarked'].fillna('S', inplace=True)\n",
    "test_data['Embarked'].fillna('S',inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']\n"
     ]
    }
   ],
   "source": [
    "# 模块 3：特征选择\n",
    "\n",
    "# 特征选择\n",
    "features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']\n",
    "train_features = train_data[features]\n",
    "train_labels = train_data['Survived']\n",
    "test_features = test_data[features]\n",
    "\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "dvec=DictVectorizer(sparse=False)\n",
    "train_features=dvec.fit_transform(train_features.to_dict(orient='record'))\n",
    "\n",
    "print(dvec.feature_names_)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,\n                       max_features=None, max_leaf_nodes=None,\n                       min_impurity_decrease=0.0, min_impurity_split=None,\n                       min_samples_leaf=1, min_samples_split=2,\n                       min_weight_fraction_leaf=0.0, presort=False,\n                       random_state=None, splitter='best')"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 模块 4：决策树模型\n",
    "\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "# 构造 ID3 决策树\n",
    "clf = DecisionTreeClassifier(criterion='entropy')\n",
    "# 决策树训练\n",
    "clf.fit(train_features, train_labels)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'numpy.ndarray' object has no attribute 'to_dict'",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-30-e0dfd29549fe>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# 模块 5：模型预测 & 评估\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtest_features\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdvec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_features\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morient\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'record'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0;31m# 决策树预测\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mpred_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'to_dict'"
     ],
     "output_type": "error"
    }
   ],
   "source": [
    "# 模块 5：模型预测 & 评估\n",
    "\n",
    "test_features=dvec.transform(test_features.to_dict(orient='record'))\n",
    "# 决策树预测\n",
    "pred_labels = clf.predict(test_features)\n",
    "\n",
    "# 得到决策树准确率\n",
    "acc_decision_tree = round(clf.score(train_features, train_labels), 6)\n",
    "print(u'score 准确率为 %.4lf' % acc_decision_tree)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
